library(tidyverse)
library(anytime)
library(gganimate)
library(gifski)
library(ggthemes)
library(sf)
library(transformr)
library(ggrepel)
# Import the raw data sets as data frames and apply a basic filter.
df_origin <- read.csv("/Users/guangjitang/Downloads/uber data/cab_rides.csv")
# Keep only the rides that have a recorded price.
df <- filter(df_origin, !is.na(price))
weather <- read.csv("/Users/guangjitang/Downloads/uber data/weather.csv")
# Restore the objects stored in map.rda; the code below expects this to
# provide an object called `map`.
load("/Users/guangjitang/test1/map.rda")
# Approximate the middle of every area as the mean of the vertex coordinates
# of the first element of its geometry (column 1 -> long_mid, column 2 ->
# lat_mid). This is a vertex average, not a true geometric centroid.
# Computed for all areas at once instead of filling the columns element by
# element inside a `1:length()` loop, which also misbehaves on an empty map.
first_rings <- lapply(map$geometry, function(geom) geom[[1]])
map$long_mid <- vapply(first_rings, function(ring) mean(ring[, 1]), numeric(1))
map$lat_mid <- vapply(first_rings, function(ring) mean(ring[, 2]), numeric(1))
# Give every map area the matching `source` name used in df, so that the map
# and the ride data can be joined later.
u <- unique(df$source)
map$nname <- NA
count_name <- 0
for (place in u) {
  # `place` is used as a case-insensitive regular expression against all area
  # names at once; when several places match one area, the last one wins.
  hits <- grepl(place, map$Name, ignore.case = TRUE)
  map$nname[hits] <- place
  count_name <- count_name + sum(hits)
}
count_name
## [1] 11
# Only 11 names were matched automatically; add the last one manually.
# NOTE(review): row 6 is a hard-coded position in `map` - confirm it is still
# the Haymarket Square area if map.rda changes.
map$nname[6] <- "Haymarket Square"
# map2 holds only the areas of interest, i.e. the areas that received a name.
map2 <- select(
  filter(map, !is.na(nname)),
  nname, geometry, long_mid, lat_mid
)
# Full data set; enable the sampling below for quick test runs only.
df_sample <- df # %>% sample_n(10000)
# Derive the time related columns for every ride.
#  - location:      the ride's source (weather is looked up for the source only)
#  - time:          time_stamp is divided by 1000 before conversion
#  - time_hour:     the calendar date ("YYYY-MM-DD") used as the weather join
#                   key; despite its name it does not contain the hour
#  - hour:          hour of the day (0-23)
#  - part_of_time:  night (22-5), morning (6-10), noon (11-13),
#                   afternoon (14-18), evening (19-21)
#  - part_duration: length in hours of that part of the day
# format() is used instead of substr() on the implicitly converted time so the
# result does not depend on how a POSIXct value is rendered as text.
df_plot <- df %>%
  mutate(
    location = source,
    time = anytime(time_stamp / 1000),
    time_hour = format(time, "%Y-%m-%d"),
    hour = as.numeric(format(time, "%H")),
    part_of_time = case_when(
      hour > 21 ~ "night",
      hour > 18 ~ "evening",
      hour > 13 ~ "afternoon",
      hour > 10 ~ "noon",
      hour > 5 ~ "morning",
      TRUE ~ "night"
    ),
    part_duration = case_when(
      hour > 21 ~ 8,
      hour > 18 ~ 3,
      hour > 13 ~ 5,
      hour > 10 ~ 3,
      hour > 5 ~ 5,
      TRUE ~ 8
    )
  )
# Running row id, used later to pick one weather record per ride.
df_plot$nid <- seq_len(nrow(df_plot))
# Prepare the weather data with the same join key as the rides: the calendar
# date ("YYYY-MM-DD") stored in `time_hour`.
weather <- weather %>%
  mutate(
    time = anytime(time_stamp),
    time_hour = format(time, "%Y-%m-%d")
  )
# df_plot <- merge(df_plot, weather, by = c("time_hour", "location"))
# Pair every ride with all weather records of the same date and location,
# then keep, per ride (nid), only the record that is closest in time.
# The previous top_n(1, g_time) kept the record with the LARGEST time gap and
# could return several rows per ride on ties; sorting by the gap and keeping
# the first row per nid yields exactly one, nearest, record.
df_plot <- inner_join(df_plot, weather, by = c("time_hour", "location")) %>%
  mutate(
    g_time = abs(as.numeric(difftime(time.x, time.y, units = "secs")))
  ) %>%
  arrange(nid, g_time) %>%
  distinct(nid, .keep_all = TRUE) %>%
  select(-g_time) %>%
  # A ride counts as rainy when its weather record has a rain value.
  mutate(rainy = !is.na(rain))
# Cache the joined data on disk and read it straight back.
save(df_plot, file = "df_plot.rda")
load(file = "df_plot.rda")
# df_price is used for the analyses based on the average price per distance
# (avg_price = price / distance). Rides of 0.5 miles or less are excluded,
# because their avg_price becomes extremely large and meaningless.
df_price <- mutate(
  filter(df_plot, distance > 0.5),
  avg_price = price / distance
)
# Price against distance, coloured by rain, one panel per cab type.
ggplot(df_plot, aes(distance, price, color = rainy)) +
  facet_wrap("cab_type") +
  geom_point(size = 1, alpha = 0.3) +
  geom_smooth() +
  theme_economist() +
  scale_color_manual(values = c("#ffcc61", "blue"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Price and distance by hour (animated over `hour`).
# Points are drawn for every 10th ride only; the smoother uses all rides.
df_plot_sample <- filter(df_plot, nid %% 10 == 0)
ggplot(df_plot, aes(distance, price, color = cab_type)) +
  transition_time(hour) +
  geom_point(data = df_plot_sample, size = 1, alpha = 0.5) +
  geom_smooth() +
  coord_cartesian(ylim = c(0, 50)) +
  labs(
    title = "Cab out-in in Boston area",
    subtitle = "Hour: {frame_time}",
    caption = "In and out",
    fill = "Out - in"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Base map: all areas outlined in grey on white, with the areas of interest
# (map2) filled by name.
plot2 <- ggplot() +
  geom_sf(
    data = map,
    mapping = aes(fill = "Rest"),
    colour = "grey",
    fill = "white",
    show.legend = FALSE
  ) +
  geom_sf(
    data = map2,
    mapping = aes(fill = nname),
    na.rm = T,
    show.legend = T
  ) +
  coord_sf(
    xlim = c(-71.14, -71.03),
    ylim = c(42.31, 42.39),
    lims_method = "cross"
  ) +
  theme_map() +
  theme(
    legend.position = c(0.95, 0),
    legend.key.size = unit(0.5, 'cm'),
    legend.text = element_text(size = 10)
  ) +
  labs(title = "Areas", fill = "Area name")
plot2

# Number of rides leaving (source_n) and arriving at (destination_n) each
# place, and their difference (out - in).
# The destination counts are joined by place name instead of being attached
# by row position, so the result stays correct even if the set or order of
# places differs between sources and destinations.
df_plot3_temp <- df_plot %>%
  group_by(destination) %>%
  summarise(n = n(), .groups = "drop")
df_plot3 <- df_plot %>%
  group_by(source) %>%
  summarise(source_n = n(), .groups = "drop") %>%
  rename(nname = source) %>%
  left_join(
    rename(df_plot3_temp, nname = destination, destination_n = n),
    by = "nname"
  ) %>%
  mutate(
    # A place that never appears as a destination has zero arrivals.
    destination_n = coalesce(destination_n, 0L),
    difference = source_n - destination_n
  )
map_plot3 <- merge(map2, df_plot3, by = "nname")
# plot3: the base map with one point per place, coloured by the difference.
plot3 <- plot2 +
  geom_point(
    data = map_plot3,
    mapping = aes(x = long_mid, y = lat_mid, color = difference)
  ) +
  scale_color_gradient(low = "cyan", high = "red")
# plot4: choropleth of the out - in difference for the areas of interest.
plot4 <- ggplot() +
  geom_sf(
    data = map,
    mapping = aes(fill = "Rest"),
    colour = "grey",
    fill = "white",
    show.legend = FALSE
  ) +
  geom_sf(data = map_plot3, mapping = aes(fill = difference)) +
  scale_fill_gradientn(
    values = c(1, 0.5, 0),
    colours = c('cyan', 'white', 'red')
  ) +
  coord_sf(
    xlim = c(-71.14, -71.03),
    ylim = c(42.31, 42.39),
    lims_method = "cross"
  ) +
  theme_map() +
  theme(
    legend.position = c(0.95, 0),
    legend.key.size = unit(0.5, 'cm'),
    legend.text = element_text(size = 10)
  )
# Out - in difference by part of the day.
# Each ride is weighted by 1 / part_duration, so the sums are rides per hour
# of the respective part of the day.
df_plot3_2_temp <- df_plot %>%
  group_by(destination, part_of_time) %>%
  summarise(n = sum(1 / part_duration), .groups = "drop")
# Destination sums are joined by place and part of the day instead of being
# attached by row position.
df_plot3_2 <- df_plot %>%
  group_by(source, part_of_time) %>%
  summarise(source_n = sum(1 / part_duration), .groups = "drop") %>%
  rename(nname = source) %>%
  left_join(
    rename(df_plot3_2_temp, nname = destination, destination_n = n),
    by = c("nname", "part_of_time")
  ) %>%
  mutate(
    # No arrivals for a place / part of the day means a sum of zero.
    destination_n = coalesce(destination_n, 0),
    difference = source_n - destination_n
  )
map_plot3_2 <- merge(map2, df_plot3_2, by = "nname")
plot2 +
  facet_grid(~part_of_time) +
  geom_point(
    data = map_plot3_2,
    mapping = aes(x = long_mid, y = lat_mid, color = difference)
  ) +
  # High difference (out > in) is cyan, as stated in the caption.
  scale_color_gradient(low = "red", high = "cyan") +
  # The difference is mapped to colour, so the legend title has to be set on
  # `colour`; setting `fill` would rename the area legend of plot2 instead.
  labs(
    title = "Cab out-in in Boston area by part of time in a day",
    caption = "Cyan for out>in",
    colour = "Out - in"
  )

# Out - in difference and average price per distance by hour and source.
# Each ride is weighted by 1 / part_duration.
df_Hour_x_Source_temp <- df_plot %>%
  group_by(destination, hour) %>%
  summarise(n = sum(1 / part_duration), .groups = "drop")
# Destination sums are joined by place and hour instead of being attached by
# row position, which silently misaligns as soon as one place/hour
# combination is missing on either side.
df_Hour_x_Source <- df_plot %>%
  group_by(source, hour) %>%
  summarise(
    source_n = sum(1 / part_duration),
    avg_price = mean(price / distance),
    .groups = "drop"
  ) %>%
  rename(nname = source) %>%
  left_join(
    rename(df_Hour_x_Source_temp, nname = destination, destination_n = n),
    by = c("nname", "hour")
  ) %>%
  mutate(
    # No arrivals for a place / hour means a sum of zero.
    destination_n = coalesce(destination_n, 0),
    difference = source_n - destination_n
  )
map_Hour_x_Source <- merge(map2, df_Hour_x_Source, by = "nname")

# Draw one map per hour showing the smoothed out - in difference per area.
#
# Reads the global objects `map_Hour_x_Source` and `map`, takes no arguments
# and prints one ggplot per hour (meant to be captured by save_gif()).
# Returns, invisibly, the list of results of the print() calls.
#
# The difference of every hour is blended with its neighbouring hours
# (0.75 * previous + current + 0.75 * next, wrapping around the ends).
# NOTE(review): this assumes every hour has the same areas in the same row
# order - confirm against map_Hour_x_Source.
makeplot <- function() {
  datalist <- split(map_Hour_x_Source, map_Hour_x_Source$hour)
  n_hours <- length(datalist)
  # Blend from a copy of the unsmoothed values. Overwriting `difference` in
  # place would feed already smoothed neighbours into the later hours.
  raw_difference <- lapply(datalist, function(d) d$difference)
  for (i in seq_len(n_hours)) {
    prev_i <- (i - 2) %% n_hours + 1
    next_i <- i %% n_hours + 1
    datalist[[i]]$difference <- 0.75 * raw_difference[[prev_i]] +
      raw_difference[[i]] +
      0.75 * raw_difference[[next_i]]
  }
  x <- lapply(datalist, function(data) {
    p <- ggplot() +
      geom_sf(
        data = map,
        mapping = aes(fill = "Rest"),
        colour = "grey",
        fill = "white",
        show.legend = FALSE
      ) +
      geom_sf(data = data, mapping = aes(fill = difference)) +
      geom_text_repel(
        data = data,
        mapping = aes(long_mid, lat_mid, label = nname)
      ) +
      scale_fill_gradientn(
        values = c(1, 0.5, 0),
        colours = c('cyan', 'white', 'red')
      ) +
      coord_sf(
        xlim = c(-71.14, -71.03),
        ylim = c(42.31, 42.39),
        lims_method = "cross"
      ) +
      theme_map() +
      theme(
        legend.position = c(0.95, 0),
        legend.key.size = unit(0.5, 'cm'),
        legend.text = element_text(size = 10)
      ) +
      labs(
        title = "Cab out-in in Boston area",
        caption = "In and out",
        fill = "Out - in"
      ) +
      labs(subtitle = paste("Hour: ", data$hour[1]))
    print(p)
  })
  invisible(x)
}
# Render the hourly maps into an animated GIF (1280 x 720, 0.5 s per frame)
# and embed it in the knitted document.
gif_file <- "gif_1.gif"
save_gif(
  makeplot(),
  gif_file = gif_file,
  width = 1280,
  height = 720,
  delay = 0.5
)
## [1] "/Users/guangjitang/test1/gif_1.gif"
knitr::include_graphics(gif_file)

# Price per distance by hour.
# The y range is based on the box plot whisker ends (stats 1 and 5) of
# price / distance, scaled by 1.5, so extreme values do not squash the boxes.
ylim1 <- boxplot.stats(df_plot$price / df_plot$distance)$stats[c(1, 5)]
ggplot(
  df_plot,
  aes(hour, price / distance, color = cab_type, group = source)
) +
  geom_boxplot(size = 1, alpha = 0.5) +
  coord_cartesian(ylim = ylim1 * 1.5) +
  labs(
    title = "Cab out-in in Boston area",
    caption = "In and out",
    fill = "Out - in"
  )

# Rides per hour leaving each place, by part of the day, drawn as one map
# panel per part of the day.
df_plot6 <- df_plot %>%
  group_by(source, part_of_time) %>%
  summarise(n = sum(1 / part_duration)) %>%
  rename(nname = source)
## `summarise()` has grouped output by 'source'. You can override using the
## `.groups` argument.
map_plot6 <- left_join(map2, df_plot6, by = "nname")
ggplot(data = map_plot6, mapping = aes(fill = n)) +
  geom_sf() +
  facet_wrap(~part_of_time)

# Hour and average price by place: one thin smoother per place plus one thick
# overall smoother with its confidence band.
ggplot(map_Hour_x_Source, mapping = aes(hour, avg_price)) +
  geom_point(aes(color = nname)) +
  geom_smooth(
    data = map_Hour_x_Source,
    mapping = aes(hour, avg_price, color = nname),
    se = FALSE,
    size = 1
  ) +
  geom_smooth(
    data = map_Hour_x_Source,
    mapping = aes(hour, avg_price),
    se = TRUE,
    size = 2
  ) +
  labs(title = "Cab hour and price in Boston area")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Out - in difference by hour, one smoother per place.
ggplot(map_Hour_x_Source, mapping = aes(hour, difference)) +
  geom_point(aes(color = nname)) +
  geom_smooth(
    data = map_Hour_x_Source,
    mapping = aes(color = nname),
    se = FALSE,
    size = 1
  ) +
  coord_cartesian(ylim = c(-50, 50)) +
  # labs() names axis labels by aesthetic (x / y); `xlab =` is not one of
  # them, so the label was never shown. The difference is on the y axis.
  labs(
    title = "Cab Out-in and price in Boston area by place",
    y = "Out and in difference",
    caption = "Positive means more people here going out than going in"
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Out - in difference and average price per distance by hour and source,
# based on df_price (rides longer than 0.5 miles) for the source side.
# NOTE(review): the destination sums below are taken from df_plot (all rides)
# while the source sums come from df_price - confirm this mix is intended.
df_price_3_temp <- df_plot %>%
  group_by(destination, hour) %>%
  summarise(n = sum(1 / part_duration), .groups = "drop")
# Destination sums are joined by place and hour instead of being attached by
# row position, which silently misaligns when the two tables do not contain
# exactly the same place/hour combinations in the same order.
df_price_3 <- df_price %>%
  group_by(source, hour) %>%
  summarise(
    source_n = sum(1 / part_duration),
    avg_price = mean(price / distance),
    .groups = "drop"
  ) %>%
  rename(nname = source) %>%
  left_join(
    rename(df_price_3_temp, nname = destination, destination_n = n),
    by = c("nname", "hour")
  ) %>%
  mutate(
    # No arrivals for a place / hour means a sum of zero.
    destination_n = coalesce(destination_n, 0),
    difference = source_n - destination_n
  )
map_price_3 <- merge(map2, df_price_3, by = "nname")
# Out - in difference against average price, all places together.
ggplot(map_price_3, mapping = aes(difference, avg_price)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Cab out-in and price in Boston area")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Out - in difference against average price, one smoother per place.
ggplot(
  map_price_3,
  mapping = aes(difference, avg_price, group = nname)
) +
  geom_point() +
  geom_smooth() +
  labs(title = "Cab out-in and price in Boston area")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'